{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from dotenv import load_dotenv\n",
    "import os\n",
    "from pathlib import Path\n",
    "import json\n",
    "from openai import OpenAI\n",
    "from tqdm import tqdm\n",
    "import re\n",
    "from glob import glob\n",
    "from time import time\n",
    "from openai_batch import OpenAIBatchProcessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_or_update_token(env_file, token_name, token_value):\n",
    "    # Read the existing .env file\n",
    "    env_file_path = Path(env_file)\n",
    "    if not env_file_path.exists():\n",
    "        print(f\"{env_file} does not exist.\")\n",
    "        # create an .env file\n",
    "        env_file_path.touch()\n",
    "        print(f\".env file created at {env_file_path}\")\n",
    "    \n",
    "    # Read lines from the .env file\n",
    "    with open(env_file, 'r') as file:\n",
    "        print(f\"{env_file} exists.\")\n",
    "        lines = file.readlines()\n",
    "    \n",
    "    # Track if the token was updated\n",
    "    token_exists = False\n",
    "    \n",
    "    # Modify the existing token if it exists\n",
    "    for i, line in enumerate(lines):\n",
    "        if line.startswith(f\"{token_name}=\"):\n",
    "            print(f\"{token_name} already exists.\")\n",
    "            lines[i] = f\"{token_name}={token_value}\\n\"\n",
    "            token_exists = True\n",
    "            break\n",
    "    \n",
    "    # If the token does not exist, append it\n",
    "    if not token_exists:\n",
    "        print(f\"Add {token_name}.\")\n",
    "        lines.append(f\"{token_name}={token_value}\\n\")\n",
    "    \n",
    "    # Write the lines back to the .env file\n",
    "    with open(env_file, 'w') as file:\n",
    "        file.writelines(lines)\n",
    "    \n",
    "    print(f\"Token {token_name} has been {'updated' if token_exists else 'added'} successfully.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add_or_update_token('../APIS/.env', 'DEEPSEEK_API_KEY', '')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv(dotenv_path = '../APIS/.env')\n",
    "os.environ[\"OPENAI_API_KEY\"] = os.getenv('OPENAI_API_KEY')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('survey_prompt_misinfo.txt', 'r') as f:\n",
    "    survey_prompt = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "message = [{\"role\": \"user\", \"content\": survey_prompt}]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create an instance of the batch processor\n",
    "batch_processor = OpenAIBatchProcessor(\n",
    "    model_name=\"gpt-4o\",  # Specify the OpenAI model\n",
    "    max_completion_tokens=1500,            # Max tokens per response\n",
    "    # temperature=0.5,           # Temperature for response randomness\n",
    "    filename_prefix=\"misinfo0625\"      # Base filename for batch job files\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num = 5000\n",
    "ids = list(range(0, num))\n",
    "messages = message*num"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 1: Create tasks for batch processing\n",
    "# tasks = [batch_processor.create_task(ids=i, messages=messages) for i in range(num)] \n",
    "tasks = batch_processor.create_task(ids, messages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 2: Write the tasks to a file\n",
    "batch_processor.write_task_file(tasks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 500\n",
    "num_files = (len(tasks) + batch_size - 1) // batch_size\n",
    "print(f'Total requests: {len(tasks)}. Batch size: {batch_size}. Separated into {num_files} files.\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Iterate over chunks of data\n",
    "for i in range(num_files):\n",
    "    print(f\"Processing batch {i+1}/{num_files}...\", end=\"\\n\\n\")\n",
    "\n",
    "    start_index = i * batch_size\n",
    "    end_index = min(start_index + batch_size, len(tasks))  # Avoid out-of-range slicing\n",
    "\n",
    "    # Slice the list to get the current batch\n",
    "    batch_data = tasks[start_index:end_index]\n",
    "\n",
    "    # Ensure we are not writing an empty batch\n",
    "    if not batch_data:\n",
    "        print(f\"Skipping batch {i+1} as it's empty.\")\n",
    "        continue\n",
    "\n",
    "    # Generate batch ID using current time\n",
    "    batch_id = int(time())\n",
    "\n",
    "    # Write batch tasks\n",
    "    batch_processor.write_batch_file(batch_data, batch_id)\n",
    "    print(f\"Batch {batch_id} written successfully.\", end=\"\\n\\n\")\n",
    "\n",
    "    # Upload batch file\n",
    "    batch_file = batch_processor.upload_batch_file(batch_id)\n",
    "    if not batch_file:\n",
    "        print(f\"Failed to upload file for batch {batch_id}. Skipping this batch.\")\n",
    "        continue  # Instead of breaking, we skip and move to the next batch\n",
    "\n",
    "    # Create batch job\n",
    "    batch_job = batch_processor.create_batch_job(batch_file)\n",
    "    if not batch_job:\n",
    "        print(f\"Failed to create batch job for batch {batch_id}. Skipping this batch.\")\n",
    "        continue\n",
    "\n",
    "    # Monitor batch job status until completion\n",
    "    final_status = batch_processor.check_batch_job_status(batch_job.id, check_interval=5)\n",
    "    print(f\"Batch job {batch_job.id} finished with status: {final_status}\")\n",
    "\n",
    "    # If batch job failed, log and continue\n",
    "    if final_status in {'failed', 'expired', 'cancelled'}:\n",
    "        print(f\"Batch job {batch_job.id} failed or was cancelled. Moving to next batch.\")\n",
    "        continue\n",
    "\n",
    "print(\"\\nBatch processing completed.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 1: Retrieve the last batch output ID\n",
    "# largest_output_id = batch_processor.check_last_batch_output()\n",
    "output_files = []\n",
    "\n",
    "# print(f\"Starting from last known batch output ID: {largest_output_id}\", end=\"\\n\\n\")\n",
    "\n",
    "# Step 2: Fetch batches from the OpenAI client\n",
    "# Ensure that `specific_batch_id` is set correctly to avoid retrieving unwanted batches.\n",
    "# specific_batch_id = \"your_last_known_batch_id\"  # Replace with the correct batch ID\n",
    "max_batches_to_fetch = 20  # Adjust based on your needs\n",
    "\n",
    "# Retrieve the latest batches\n",
    "for batch in batch_processor.client.batches.list(limit=max_batches_to_fetch):\n",
    "    \n",
    "    if batch.id == 'batch_685b761412008190b545d5aede6f1f0a':\n",
    "        break\n",
    "    print(f\"Batch ID: {batch.id}, Status: {batch.status}\")\n",
    "    \n",
    "\n",
    "    # Collect only completed batches, skipping failed or canceled ones\n",
    "    if batch.status == \"completed\":\n",
    "        batch_id, created_at, output_file_id = batch.id, batch.created_at, batch.output_file_id\n",
    "        output_files.append([batch_id, created_at, output_file_id])\n",
    "\n",
    "# Step 3: Print summary\n",
    "print(f\"Total completed batches retrieved: {len(output_files)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "largest_output_id = -1\n",
    "for i, batch_info in enumerate(output_files[:10]):\n",
    "    batch_id, created_at, output_file_id = batch_info\n",
    "    print(f\"Processing completed batch {batch_id} (Created: {created_at}) with Output File ID: {output_file_id}\")\n",
    "\n",
    "    output_id = i+1+largest_output_id\n",
    "    batch_processor.save_batch_output(output_file_id)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Organize results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_processor.filename_prefix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = []\n",
    "output_files = glob(f'{batch_processor.output_dir}/misinfo*.json')\n",
    "\n",
    "# Iterate over each file in the directory\n",
    "for file_path in tqdm(output_files, total=len(output_files), desc='Processing files'):\n",
    "    # print(f\"Processing file: {file_path}\")\n",
    "    with open(file_path, 'r') as file:\n",
    "        for line in file:\n",
    "            # Parse the JSON string into a dictionary\n",
    "            json_object = json.loads(line.strip())\n",
    "            \n",
    "            # Extract the custom_id\n",
    "            custom_id = json_object.get('custom_id', None)\n",
    "            \n",
    "            # Extract the response body and parse the content to get values\n",
    "            try:\n",
    "                response_content = json.loads(json_object['response']['body']['choices'][0]['message']['content'])\n",
    "                # Append to results\n",
    "                results.append(response_content)\n",
    "            except:\n",
    "                print(json_object['response']['body']['choices'][0]['message']['refusal'])  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "file_name = 'results/openai_gpt4o_res_misinfo_0625.json'\n",
    "\n",
    "with open(file_name, \"w\", encoding=\"utf-8\") as json_file:\n",
    "    json.dump(results, json_file, indent=4)\n",
    "\n",
    "print(\"JSON data saved to\", file_name)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
